data <- read.csv('elevation_villages_correct.csv')
data %>%
group_by(language) %>%
slice(which.min(elevation)) %>%
select(eng_vil_name, elevation) %>%
write.csv('minimum_vil.csv')
## Adding missing grouping variables: `language`
Average elevation across different languages (grey dots represent different villages:)
ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
theme_bw()+
# stat_summary(fun.data = mean_se,
# geom = "errorbar", color='black', alpha=0.3)+
geom_jitter(color='grey', alpha=0.1)+
stat_summary(aes(color=Family), fun = "mean", geom = "point", size=2)+
labs(y = "Elevation", x='Language', color='Family')+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
theme(legend.position = "top")+
scale_color_viridis(discrete = TRUE, option = "D")+
coord_flip()+
theme(axis.text.x=element_text(angle=0, hjust=0.5),
# text = element_text(family = "Andale Mono"),
legend.position = 'bottom')+
geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')+
guides(colour = guide_legend(nrow = 1))
ggsave('elev_lang.jpg', width = 8, height = 5)
How the population size correlates with elevation? Well, there seems to be a slight decrease with higher elevations (although we cannot trust those regressions).
data$log_c <- 0
data$log_c <- log(data$census_1926)
data$el_100 <- plyr::round_any(data$elevation, 1, f = ceiling)
data %>%
group_by(el_100) %>%
summarise(mean_c = mean(log_c)) %>%
ggplot(aes(x=el_100, y=mean_c))+
geom_point(alpha=0.4)+
geom_smooth(method = 'lm', color='red')+
theme_bw()+
labs(y = "Mean population size", x='Elevation')
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
theme_bw()+
# geom_boxplot(aes(fill=Family))+
geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 60, method = 'histodot')+
facet_wrap(vars(Family), scales = "free_y", strip.position = 'top')+
labs(y = "Elevation", x='Language', size=' 1926 Census\n data')+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
theme(legend.position = "top")+
scale_color_viridis(discrete = TRUE, option = "D")+
coord_flip()+
theme(legend.position = "none",
legend.key = element_blank(),
strip.text = element_text(face="bold"),
axis.text.x=element_text(angle=0, hjust=0.5))
ggsave('elev_lang.jpg', width = 8, height = 5)
data[data$language == 'Godoberi ',] %>%
write.csv('godoberi.csv')
ggplot(data, aes(y=elevation, x=Family))+
theme_bw()+
geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 41, method = 'histodot')+
theme(legend.position = "none",
legend.key = element_blank(),
strip.text = element_text(face="bold"),
axis.text.x=element_text(angle=0, hjust=0.5))+
# stat_summary(fun = "mean", geom = "crossbar", size=0.5, alpha=0.1, color='grey')+
# geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')
ggsave('family_dist.jpg', width = 8, height = 5)
all <- read.csv('all.csv')
kable(head(all))
| X | residence | expedition | name | number.of.na | russian.na | sex | type | year_of_birth | year.of.death | аварский | агульский | азербайджанский | азербайджанский.или.кумыкский | акушинский.даргинский | андийский | арчинский | ахвахский | багвалинский | бежтинский | ботлихский | гапшиминский.даргинский | гдымско.фийский.лезгинский | гинухский | грузинский | даргинский | кадарский.даргинский | кайтагский.даргинский | каратинский | кубачинский.даргинский | кумыкский | лакский | лезгинский | мегебский | муиринский.даргинский | русский | рутульский | сирхинский.даргинский | табасаранский | тукитинский | хновский.рутульский | цахурский | цезский | цудахарский.даргинский | чеченский | чирагский.даргинский | mother.tongue | village.population | language.population | number.of.lang | mother.tongue.match | number.of.lang.strat | elevation |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Иса | 0 | 0 | м | 0 | 1885 | 1965 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 |
| 1 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Бет1и | 0 | 0 | ж | 0 | 1888 | 1978 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 |
| 2 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Абдула | 0 | 0 | м | 0 | 1890 | 1972 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 2 | 1 | 2 | 1658.484 |
| 3 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Патимат | 0 | 0 | ж | 0 | 1890 | 1985 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 |
| 4 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Исмаил | 0 | 0 | м | 0 | 1890 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 2 | 1 | 2 | 1658.484 |
| 5 | Balkhar | Balkhar, Tsulikana, Shukty, Kuli | Жумяъ/Маи | 0 | 0 | ж | 0 | 1890 | 1967 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 |
ggplot(data = all, aes(y=elevation, x=factor(number.of.lang.strat)))+
theme_bw()+
geom_violin(fill='grey', color='grey', trim = TRUE)
fit_p <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, family="poisson", data=all)
fit_lin <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, data=all)
Linear:
ggplot(ggpredict(fit_lin, terms = c('elevation')), aes(x, predicted)) +
geom_line()+
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
theme_bw()
# facet_wrap(~facet)
Poisson:
ggplot(ggpredict(fit_p, terms = c('elevation')), aes(x, predicted)) +
geom_line()+
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
theme_bw()
# facet_wrap(~facet)
ITM <- read.csv('ITM.csv')
all <- read.csv('all.csv')
all_subset <- all[all$year_of_birth < 1930,]
all_subset$norm_yb <- scale(all_subset$year_of_birth)
all_subset$norm_el <- scale(all_subset$elevation)
all_subset$norm_itm <- scale(all_subset$number.of.lang.strat)
all_subset$norm_pop <- scale(all_subset$language.population)
all$norm_yb <- scale(all$year_of_birth)
all$norm_el <- scale(all$elevation)
all$norm_itm <- scale(all$number.of.lang.strat)
all$norm_pop <- scale(all$language.population)
all$norm_vil_pop <- scale(all$village.population)
pois_2 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=all_subset, family='poisson')
summary(pois_2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
## Data: all_subset
##
## AIC BIC logLik deviance df.resid
## 4301.6 4323.8 -2146.8 4293.6 1921
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4442 -0.4142 -0.0894 0.2961 3.6068
##
## Random effects:
## Groups Name Variance Std.Dev.
## norm_yb:sex (Intercept) 0.0134 0.1157
## residence:mother.tongue (Intercept) 0.5030 0.7092
## Number of obs: 1925, groups: norm_yb:sex, 139; residence:mother.tongue, 50
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2453 0.1073 -2.286 0.0223 *
## norm_el 0.3285 0.1068 3.076 0.0021 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## norm_el -0.017
Garik M. model:
garik_model <- glmer(number.of.lang.strat ~ sex + norm_pop + (1|norm_yb) + (1|residence), data=all, family='poisson')
summary(garik_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: number.of.lang.strat ~ sex + norm_pop + (1 | norm_yb) + (1 |
## residence)
## Data: all
##
## AIC BIC logLik deviance df.resid
## 12671.9 12705.8 -6330.9 12661.9 6473
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2651 -0.5178 -0.1423 0.3334 4.8590
##
## Random effects:
## Groups Name Variance Std.Dev.
## norm_yb (Intercept) 0.1061 0.3257
## residence (Intercept) 0.7807 0.8836
## Number of obs: 6478, groups: norm_yb, 158; residence, 50
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.75257 0.13259 -5.676 1.38e-08 ***
## sexм 0.24847 0.02819 8.814 < 2e-16 ***
## norm_pop -0.29958 0.11690 -2.563 0.0104 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexм
## sexм -0.124
## norm_pop -0.080 -0.001
Normalized elevation vs. normalized village population.
ggplot(data=all, aes(x=norm_el, y=norm_vil_pop))+
geom_point()
pois_2_pop <- glmer(number.of.lang ~ norm_el + (1|norm_yb) + (1|sex) + (1|residence) + norm_pop + norm_vil_pop, data=all, family='poisson')
summary(pois_2_pop)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## number.of.lang ~ norm_el + (1 | norm_yb) + (1 | sex) + (1 | residence) +
## norm_pop + norm_vil_pop
## Data: all
##
## AIC BIC logLik deviance df.resid
## 12659.3 12706.8 -6322.7 12645.3 6471
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2613 -0.5227 -0.1316 0.3337 4.8098
##
## Random effects:
## Groups Name Variance Std.Dev.
## norm_yb (Intercept) 0.10649 0.3263
## residence (Intercept) 0.47514 0.6893
## sex (Intercept) 0.02003 0.1415
## Number of obs: 6478, groups: norm_yb, 158; residence, 50; sex, 2
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.67170 0.14619 -4.595 4.33e-06 ***
## norm_el 0.28415 0.10318 2.754 0.00589 **
## norm_pop -0.30435 0.09433 -3.227 0.00125 **
## norm_vil_pop -0.50435 0.11194 -4.506 6.62e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) norm_l nrm_pp
## norm_el -0.034
## norm_pop -0.041 0.138
## norm_vil_pp 0.094 0.095 0.125
ITM$norm_yb <- scale(ITM$year_of_birth)
ITM$norm_el <- scale(ITM$elevation)
ITM$norm_itm <- scale(ITM$number.of.lang.strat)
pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
## Data: ITM
##
## AIC BIC logLik deviance df.resid
## 8494.0 8519.5 -4243.0 8486.0 4323
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2029 -0.4526 -0.1267 0.2698 3.8511
##
## Random effects:
## Groups Name Variance Std.Dev.
## norm_yb:sex (Intercept) 0.04944 0.2223
## residence:mother.tongue (Intercept) 0.74412 0.8626
## Number of obs: 4327, groups: norm_yb:sex, 118; residence:mother.tongue, 50
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6311 0.1281 -4.928 8.33e-07 ***
## norm_el 0.3873 0.1281 3.023 0.00251 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## norm_el -0.059
pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
## Data: ITM
##
## AIC BIC logLik deviance df.resid
## 8494.0 8519.5 -4243.0 8486.0 4323
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2029 -0.4526 -0.1267 0.2698 3.8511
##
## Random effects:
## Groups Name Variance Std.Dev.
## norm_yb:sex (Intercept) 0.04944 0.2223
## residence:mother.tongue (Intercept) 0.74412 0.8626
## Number of obs: 4327, groups: norm_yb:sex, 118; residence:mother.tongue, 50
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6311 0.1281 -4.928 8.33e-07 ***
## norm_el 0.3873 0.1281 3.023 0.00251 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## norm_el -0.059
AIC(pois_1)
## [1] 8493.972
ITM_el <- dplyr::select(ITM, elevation, number.of.lang.strat) %>%
group_by(elevation) %>%
summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
kable(head(ITM_el))
| elevation | mean_itm | count |
|---|---|---|
| 355.0269 | 0.0000000 | 94 |
| 365.5033 | 1.0689655 | 145 |
| 407.0000 | 0.0973451 | 113 |
| 450.7447 | 0.5662651 | 83 |
| 491.3748 | 0.8709677 | 31 |
| 667.3964 | 0.0000000 | 66 |
pred <- ggpredict(pois_1, terms = c('norm_el'))
true_sequence <- seq(min(ITM$elevation), max(ITM$elevation), length.out=21)
ggplot()+
geom_point(data=ITM_el, aes(x=elevation, y=mean_itm, size=count), color='blue', alpha=.5)+
geom_line(data=pred, aes(x=true_sequence, y=predicted), size=0.8)+
geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=true_sequence), alpha = .15)+
theme_bw()+
labs(x = "Elevation", y='Average ITM', size=' Number\n of observations')
ggsave('poison_observ.jpg', width = 7, height = 4)
Hypothesis: isolation is not the primary source of language diversity.
all$rounded.yb <- plyr::round_any(all$year_of_birth, 5, f = ceiling)
L2 <- all[all$mother.tongue != 'рутульский' & all$'рутульский' == 1,] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(рутульский), count = n(), itm = mean(number.of.lang))
L1 <- all[all$mother.tongue == 'рутульский',] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(рутульский), count = n(), itm = mean(number.of.lang))
ggplot()+
theme_bw()+
geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
ggtitle('Rutul knowledge as L1 or L2')
L2 <- all[all$mother.tongue != 'аварский' & all$'аварский' == 1,] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(аварский), count = n(), itm = mean(number.of.lang))
L1 <- all[all$mother.tongue == 'аварский',] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(аварский), count = n(), itm = mean(number.of.lang))
ggplot()+
theme_bw()+
geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
ggtitle('Avar knowledge as L1 or L2')
L2 <- all[all$mother.tongue != 'лакский' & all$'лакский' == 1,] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang))
L1 <- all[all$mother.tongue == 'лакский',] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang))
ggplot()+
theme_bw()+
geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
ggtitle('Lak knowledge as L1 or L2')
L2 <- all[all$mother.tongue != 'азербайджанский' & all$'азербайджанский' == 1,] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(азербайджанский), count = n(), itm = mean(number.of.lang))
L1 <- all[all$mother.tongue == 'азербайджанский',] %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang))
ggplot()+
theme_bw()+
geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
ggtitle('Azerbaijani knowledge as L1 or L2')
all %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(рутульский), count = n()) %>%
ggplot(aes(x=rounded.yb, y=mean_itm))+
geom_point(aes(size=count))+
geom_smooth(aes(weight = count), method=loess, color='red')+
theme_bw()+
labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'
all %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(аварский), count = n()) %>%
ggplot(aes(x=rounded.yb, y=mean_itm))+
geom_point(aes(size=count))+
geom_smooth(aes(weight = count), method=loess, color='red')+
theme_bw()+
labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'
all %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(лакский), count = n()) %>%
ggplot(aes(x=rounded.yb, y=mean_itm))+
geom_point(aes(size=count))+
geom_smooth(aes(weight = count), method=loess, color='red')+
theme_bw()+
labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'
all %>%
group_by(rounded.yb) %>%
mutate(mean_itm = mean(сирхинский.даргинский), count = n()) %>%
ggplot(aes(x=rounded.yb, y=mean_itm))+
geom_point(aes(size=count))+
geom_smooth(aes(weight = count), method=loess, color='red')+
theme_bw()+
labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'
all %>%
group_by(mother.tongue) %>%
summarise(count = n()) %>%
arrange(count, desc = FALSE)
## `summarise()` ungrouping output (override with `.groups` argument)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat
## # A tibble: 29 x 2
## mother.tongue count
## <fct> <int>
## 1 муиринский даргинский 49
## 2 цудахарский даргинский 93
## 3 кумыкский 97
## 4 чирагский даргинский 101
## 5 гапшиминский даргинский 104
## 6 табасаранский 111
## 7 ахвахский 112
## 8 лезгинский 112
## 9 гинухский 154
## 10 гдымско-фийский лезгинский 157
## # … with 19 more rows
itm_coors <- read.csv('itm_coords_no_Hinuq.csv')
itm_coors <- itm_coors[complete.cases(itm_coors), ]
kable(head(itm_coors))
| X | index | Unnamed..0 | expedition | name | number.of.na | russian.na | sex | type | year_of_birth | year.of.death | аварский | агульский | азербайджанский | азербайджанский.или.кумыкский | акушинский.даргинский | андийский | арчинский | ахвахский | багвалинский | бежтинский | ботлихский | гапшиминский.даргинский | гдымско.фийский.лезгинский | гинухский | грузинский | даргинский | кадарский.даргинский | кайтагский.даргинский | каратинский | кубачинский.даргинский | кумыкский | лакский | лезгинский | мегебский | муиринский.даргинский | русский | рутульский | сирхинский.даргинский | табасаранский | тукитинский | хновский.рутульский | цахурский | цезский | цудахарский.даргинский | чеченский | чирагский.даргинский | mother.tongue | village.population | language.population | number.of.lang | mother.tongue.match | number.of.lang.strat | elevation | Lat | Lon |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Balkhar | 0 | Balkhar, Tsulikana, Shukty, Kuli | Абедет | 0 | 0 | ж | 0 | 1922 | 2000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 | 42.2797 | 47.2314 |
| 1 | Balkhar | 1 | Balkhar, Tsulikana, Shukty, Kuli | Абдурахман | 0 | 0 | м | 0 | 1923 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 | 42.2797 | 47.2314 |
| 2 | Balkhar | 2 | Balkhar, Tsulikana, Shukty, Kuli | Магомед | 0 | 0 | м | 0 | 1923 | 1995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 | 42.2797 | 47.2314 |
| 3 | Balkhar | 3 | Balkhar, Tsulikana, Shukty, Kuli | Жума | 0 | 0 | ж | 0 | 1924 | 1984 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 | 42.2797 | 47.2314 |
| 4 | Balkhar | 4 | Balkhar, Tsulikana, Shukty, Kuli | Шаммадаев Калла | 0 | 0 | м | 0 | 1924 | 2004 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 2 | 1 | 2 | 1658.484 | 42.2797 | 47.2314 |
| 5 | Balkhar | 5 | Balkhar, Tsulikana, Shukty, Kuli | Гусейн | 0 | 0 | м | 0 | 1925 | 2010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | лакский | 1195 | 31987 | 0 | 1 | 0 | 1658.484 | 42.2797 | 47.2314 |
itm_vill <- itm_coors %>%
group_by(index, Lat, Lon, elevation) %>%
summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon' (override with `.groups` argument)
kable(head(itm_vill))
| index | Lat | Lon | elevation | mean_itm | count |
|---|---|---|---|---|---|
| Balkhar | 42.27970 | 47.23140 | 1658.4839 | 0.3296703 | 91 |
| Bezhta | 42.13390 | 46.12470 | 1633.0813 | 1.3636364 | 143 |
| Burkikhan | 41.80636 | 47.53847 | 1917.4192 | 0.7826087 | 92 |
| Chabanmakhi | 42.63784 | 47.25923 | 1020.0406 | 0.6400000 | 75 |
| Chankurbe | 42.63047 | 47.30022 | 882.7377 | 0.5652174 | 92 |
| Chirag | 41.83810 | 47.43030 | 2223.1909 | 0.9718310 | 71 |
matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix < 19] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
itm_vill$knn_dist <- knn(net)$knn
itm_vill$degree <- degree(net)
ggplot(itm_vill, aes(x=factor(degree), y=mean_itm, label=index))+
theme_bw()+
geom_boxplot()
itm_vill %>%
group_by(degree) %>%
summarise(mean = mean(mean_itm), sd = sd(mean_itm),
n = n()) %>%
mutate(se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se) %>%
ggplot(aes(x=factor(degree), y=mean))+
geom_point()+
geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN
itm_vill %>%
group_by(degree) %>%
summarise(mean = mean(mean_itm), sd = sd(mean_itm),
n = n()) %>%
mutate(se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se) %>%
ggplot(aes(x=factor(degree), y=mean))+
geom_point()+
geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN
The villages that are closer than 12km to each-other are interconnected with mean itm and elevation plotted for each village.
matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
net %>%
ggraph(layout = 'kk')+
geom_edge_link(color = "orange")+
theme_void()+
geom_node_point(aes(size=itm_vill$mean_itm, color=itm_vill$elevation)) +
geom_node_text(size = 4,
aes(label = name), repel=TRUE)+
labs(size='Mean ITM', color='Elevation')
ggsave('village_graph.jpg', width = 7, height = 4)
Same plot, but mapped to the actual geospatial data.
matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
ggraph(graph=net, x=itm_vill$Lon, y=itm_vill$Lat)+
theme_void()+
geom_edge_link(color = "orange")+
geom_node_point(aes(color=itm_vill$mean_itm)) +
geom_node_text(size = 4,
aes(label = name), repel=TRUE)+
labs(size='Mean ITM', color='Mean ITM')+
annotation_map(map_data("world"), fill = NA, colour = "grey50")
ggsave('village_graph_map.jpg', width = 7, height = 4)
data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
All the villages from the bigger dataset interconnected if they are closer than 6kms. Colored by family.
matrix_fam[matrix_fam > 6] <- 0
rownames(matrix_fam) <- data_short$eng_vil_name
colnames(matrix_fam) <- data_short$eng_vil_name
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)
net_fam %>%
# ggraph(layout = 'kk')+
ggraph(x=data_short$Lon, y=data_short$Lat)+
geom_edge_link(color = "orange")+
theme_void()+
geom_node_point(aes(color=data_short$Family), size=0.5)+
labs(color='Family')
# geom_node_text(aes(label = name), repel=TRUE)+
# ggsave('village_graph_map.pdf', width = 30, height = 30)
The number of neighbours in the graph above with regards to the elevation of the village (elevation is rounded to 100-s). Seems to be no effect.
data$neighbours_6 <- degree(net_fam)
data$log_c <- log(data$census_1926)
data$elevation_round <- plyr::round_any(data$elevation, 100, f = ceiling)
data %>%
group_by(elevation_round) %>%
summarise(neighbours = mean(neighbours_6), n_6 = neighbours_6, size=n(), pop = census_1926) %>%
ggplot(aes(x=elevation_round, y=neighbours))+
geom_line()+
geom_point(aes(y=n_6, size=pop), alpha=0.1, color='blue')+
theme_bw()
## `summarise()` regrouping output by 'elevation_round' (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).
Same graph as with bigger dataset, but with multidag data.
by_lang <- data %>%
group_by(language, Family) %>%
summarise(Lon = mean(Lon), Lat = mean(Lat), n = n())
## `summarise()` regrouping output by 'language' (override with `.groups` argument)
m_l <- distm(by_lang[,c('Lon','Lat')], by_lang[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
m_l[m_l > 25] <- 0
rownames(m_l) <- by_lang$language
colnames(m_l) <- by_lang$language
net_l <- graph.adjacency(m_l, mode = "undirected", weighted = TRUE, diag = FALSE)
net_l %>%
ggraph(layout = 'kk')+
geom_edge_link(color = "orange")+
theme_void()+
geom_node_point(aes(color=by_lang$Family))+
geom_node_text(size = 2,
aes(label = name), repel=TRUE)+
labs(color='Family')
How many members of different families are around each village?
data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix_fam[matrix_fam > 100] <- 0
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)
V(net_fam)$label <- data$Family
SameLabel <- function(e) {
V(net_fam)[ends(net_fam, e)[1]]$label == V(net_fam)[ends(net_fam, e)[2]]$label }
g2 <- delete_edges(net_fam, which(sapply(E(net_fam), SameLabel)))
data_short$degree <- degree(g2)
ggplot(data=data_short, aes(x=Family, y=degree))+
geom_boxplot()
# geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 0.45)
# ggsave('degree_family.jpg', width = 7, height = 4)
data_short %>%
group_by(Family) %>%
summarise(mean = mean(degree))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
## Family mean
## <fct> <dbl>
## 1 Andic 487.
## 2 Avar 409.
## 3 Dargwa 586.
## 4 Lak 839.
## 5 Lezgic 340.
## 6 Tsezic 405.
## 7 Turkic 589.
data_itm_fam <- read.csv('itm_coords_Family.csv')
itm_vill_fam <- data_itm_fam %>%
group_by(index, Lat, Lon, elevation, Family, mother.tongue) %>%
summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon', 'elevation', 'Family' (override with `.groups` argument)
itm_vill_fam <- itm_vill_fam[!itm_vill_fam$index == 'Hinuq',]
itm_vill_fam[itm_vill_fam$mother.tongue == 'кадарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'сирхинский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цудахарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'гдымско-фийский лезгинский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цахурский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'азербайджанский',]$Family <- 'Turkic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'ахвахский',]$Family <- 'Andic'
itm_vill_fam$Lat <- itm_vill$Lat
itm_vill_fam$Lon <- itm_vill$Lon
Villages that are closer to each other than 25kn and dont share the same mother tongue:
matrix <- distm(itm_vill_fam[,c('Lon','Lat')], itm_vill_fam[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 25] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill_fam$index
colnames(matrix) <- itm_vill_fam$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
# V(net)$label <- itm_vill_fam$Family
V(net)$label <- itm_vill_fam$mother.tongue
SameLabel <- function(e) {
V(net)[ends(net, e)[1]]$label == V(net)[ends(net, e)[2]]$label }
g2 <- delete_edges(net, which(sapply(E(net), SameLabel)))
itm_vill_fam$deg <- degree(g2)
# DiffLabel <- function(e) {
# V(net)[ends(net, e)[1]]$label != V(net)[ends(net, e)[2]]$label }
# g3 <- delete_edges(net, which(sapply(E(net), DiffLabel)))
#
itm_vill_fam$deg <- degree(g2)
# itm_vill_fam$deg_tot <- degree(g3)
# itm_vill_fam$deg <- log(itm_vill_fam$deg_n/itm_vill_fam$deg_tot)
g2 %>%
ggraph(layout = 'kk')+
geom_edge_link(color = "orange", alpha=0.5)+
theme_void()+
geom_node_point(aes(color=itm_vill_fam$Family))+
scale_color_brewer(palette="Dark2")+
labs(color='Family')
# ggsave('village_graph.jpg', width = 7, height = 4)
ggplot(itm_vill_fam, aes(y=mean_itm, x=deg))+
geom_point()
ggplot(itm_vill_fam, aes(x=Family, y=deg))+
geom_boxplot()+
labs(x='Degree')
itm_deg_m <- lmer(mean_itm ~ deg + (1|Family:mother.tongue) + count, itm_vill_fam)
summary(itm_deg_m)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ deg + (1 | Family:mother.tongue) + count
## Data: itm_vill_fam
##
## REML criterion at convergence: 54.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.25696 -0.55455 0.05043 0.40487 2.47483
##
## Random effects:
## Groups Name Variance Std.Dev.
## Family:mother.tongue (Intercept) 0.12227 0.3497
## Residual 0.05217 0.2284
## Number of obs: 49, groups: Family:mother.tongue, 28
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.519597 0.175582 45.905846 2.959 0.00486 **
## deg 0.019170 0.023068 45.390793 0.831 0.41032
## count 0.001549 0.001098 35.404594 1.412 0.16682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) deg
## deg -0.676
## count -0.679 0.128
pred <- ggpredict(itm_deg_m, terms = c('deg'))
ggplot()+
geom_point(data=itm_vill_fam, aes(y=mean_itm, x=deg, color=Family))+
geom_text_repel(data=itm_vill_fam, aes(label=index, y=mean_itm, x=deg))+
geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
scale_color_brewer(palette="Dark2")
It seems that using the number of neighbouring villages that are speaking different languages from the given village we can predict ITM! Here the data is extracted from the bigger dataset:
overlap <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]$eng_vil_name
ov_itm_vill_fam <- itm_vill_fam[itm_vill_fam$index %in% overlap,]
data_all <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]
itm_vill_fam_m <- merge(x= ov_itm_vill_fam, y = data_all[, c('eng_vil_name', 'degree')], by.x='index', by.y = 'eng_vil_name', all=TRUE, copy=TRUE)
itm_deg <- lmer(mean_itm ~ degree + (1|Family:mother.tongue) + count, itm_vill_fam_m)
summary(itm_deg)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ degree + (1 | Family:mother.tongue) + count
## Data: itm_vill_fam_m
##
## REML criterion at convergence: 60.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7488 -0.5158 0.2256 0.5314 1.9616
##
## Random effects:
## Groups Name Variance Std.Dev.
## Family:mother.tongue (Intercept) 0.06150 0.2480
## Residual 0.09047 0.3008
## Number of obs: 41, groups: Family:mother.tongue, 25
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.0701453 0.2604493 28.9692203 4.109 0.000298 ***
## degree -0.0009389 0.0004361 23.1926888 -2.153 0.041951 *
## count 0.0016051 0.0012567 35.5238115 1.277 0.209824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) degree
## degree -0.841
## count -0.442 -0.026
pred <- ggpredict(itm_deg, terms = c('degree'))
ggplot()+
geom_point(data=itm_vill_fam_m, aes(y=mean_itm, x=degree, color=Family))+
geom_text_repel(data=itm_vill_fam_m, aes(label=index, y=mean_itm, x=degree))+
geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
scale_color_brewer(palette="Dark2")